#!/usr/bin/python


import urllib2
from bs4 import BeautifulSoup
import sqlite3 as sqlite
import os

# Working directory captured at import time; attachment files are written
# under <path>/patches/<issue-key>/ relative to this.
path = os.getcwd()


def get_page(url):
    try:
        doc = urllib2.urlopen(url).read()
        print 'Get Page', url
        return doc
    except:
        return None

def parse_document(doc):
    """
    return dict: {details:{type, status, priority, resolution, affetcs version, fix version, components, labels, environment},
                description, issue links, activity{comments}}
    """
    if doc is None:
        return None
    soup = BeautifulSoup(doc)
    dict = {}
    dict['project'] = soup.find('a', id="project-name-val").text.strip()
    dict['issueKey'] = soup.find('a', id="key-val").text.strip()
    dict['url'] = soup.find('a', id="key-val")['href']
    details = {}
    try:
        type_val = soup.find("span", id="type-val").text.strip()
        status_val = soup.find("span", id="status-val").text.strip()
        priority_val = soup.find("span", id="priority-val").text.strip()
        resolution_val = soup.find("span", id="resolution-val").text.strip()
        affects_versions = soup.find("span", id="version-val")
        
        fix_versions = soup.find("span", id="fixfor-val").text.strip()
        components = soup.find("span", id="components-val").text.strip()
        environment = soup.find("div", id="environment-val")
        # details['type'] = type_val
        # details['status'] = status_val
        # details['priority'] = priority_val
        # details['resolution'] = resolution_val
        # details['affects'] = affects_versions
        # details['fix'] = fix_versions
        # details['components'] = components
        # details['environment'] = environment
        # dict['details'] = details
        dict['type'] = type_val
        dict['status'] = status_val
        dict['priority'] = priority_val
        dict['resolution'] = resolution_val
        dict['affects'] = affects_versions
        dict['fix'] = fix_versions
        dict['components'] = components
        dict['environment'] = environment
    except:
        dict['details'] = None
        print "Extract detail failed"
    try:
        description = soup.find("div", "user-content-block").text
        dict['description'] = description
    except:
        dict['description'] = None
        print "Extract description failed"

    try:
        people = {}
        assignee = soup.find("span", id="assignee-val").text.strip()
        reporter = soup.find("span", id="reporter-val").text.strip()
        votes = soup.find("span", id="vote-data").text.strip()
        watcher = soup.find("span", id="watcher-data").text.strip()
        # people['assignee'] = assignee
        # people['reporter'] = reporter
        # people['votes'] = votes
        # people['watcher'] = watcher
        # dict['people'] = people

        dict['assignee'] = assignee
        dict['reporter'] = reporter
        dict['votes'] = votes
        dict['watcher'] = watcher
    except:
        dict['people'] = None
        print 'Extract people failed'

    try:
        time = {}
        create = soup.find("span", id="create-date").find("time")['datetime']
        update = soup.find("span", id="updated-date").find("time")['datetime']
        resolved = soup.find("span", id="resolved-date").find("time")['datetime']
        # time['create'] = create
        # time['update'] = update
        # time['resolved'] = resolved
        # dict['time'] = time

        dict['createTime'] = create
        dict['updateTime'] = update
        dict['resolvedTime'] = resolved
    except:
        dict['time'] = None
        print 'Extract Time Failed'

    try:
        links = []
        elements = soup.find("div", id="linkingmodule")
        if not elements:
            print "do not has linking module"
        else:
            elements = elements.findAll('dd')
            for dd in elements:
                link = {}
                link['issue-key'] = dd.find('span').find('a')['data-issue-key']
                link['title'] = dd.find('span')['title']
                links.append(link)
        dict['links'] = links
    except:
        dict['links'] = None
        print 'Extract links Failed'

    try:
        comments = []
        actions = soup.find("div", id="issue_actions_container")
        blocks = actions.findAll("div", "issue-data-block activity-comment twixi-block  expanded")
        for block in blocks:
            user_id = block.find("div", "action-details").find("a").text.strip()
            action_time = block.find("div", "action-details").find("time")['datetime']
            action = block.find("div", "action-details").text.strip()
            action_body = block.find("div", "action-body flooded").text
            a = {"user": user_id, "time": action_time, "action": action, "body": action_body}
            comments.append(a)
        dict['comments'] = comments
    except:
        dict['comments'] = None
        print 'Extract Comments Failed'

    try:
        attachments = []
        elements = soup.find("div", id="attachmentmodule")
        if not elements:
            print 'do not find attachements'
        else:
            elements = elements.find("div", "mod-content").find("li")
            burl = "https://issues.apache.org"
            for item in elements:
                filename = item.find("dt", "attachment-title").text
                fileurl = item.find("div", "attachment-thumb").find("a")['href']
                filecontent = urllib2.urlopen(burl+fileurl).read()
                print 'download attachment'
                attach = {}
                attach['filename'] = filename
                attach['fileurl'] = fileurl
                attach['content'] = filecontent
                attachments.append(attach)
                os.chdir(path+"/patches")
                if os.path.isdir(dict['issueKey']) is False:
                    os.mkdir(dict['issueKey'])
                os.chdir(dict['issueKey'])
                with open(filename, 'w+') as f:
                    f.write(dict['issueKey'])
                f.close()
                os.chdir(path)
                print "create attachment dir and file"                

        dict['attachments'] = attachments
    except:
        dict['attachments'] = None
        print 'Extract Attachments Failed'

    return dict



import time
def delay():
    """Pause for a fixed interval so consecutive requests are throttled."""
    pause_seconds = 2
    time.sleep(pause_seconds)

def generate_urls(first=376, last=4662):
    """Return the list of Jira browse URLs for issues SVN-first..SVN-last.

    The bounds were previously hard-coded; they are now defaulted
    parameters so other ranges can be crawled without editing the source.
    Calling with no arguments produces exactly the original list.
    """
    return ['https://issues.apache.org/jira/browse/SVN-%d' % i
            for i in range(first, last + 1)]

import redis
def main():
    r = redis.Redis()
    urls = generate_urls()
    for url in urls:
        doc = get_page(url)
        if not doc:
            continue
        dict = parse_document(doc)
        key = dict['issueKey']
        keys = dict.keys()
        keys.sort()
        for k in keys:
            r.hset(key, k, dict[k])
        print key, 'extracted'
        #delay()

# Script entry point: run the full crawl when executed directly.
if __name__ == '__main__':
    main()




